Testing primers against PR2
Testing primers against PR2
1 Goal
Display primer set matches against the PR2 database (latest version 4.12.0)
Matches are computed by an independent R script (PR2 Primers pr2_match.R) and stored in an rda file which is read by this Rmd file.
In this version # of mismatches is computed as well as position of mismatches
1.1 To do
- Check why there are some negative amplicon (probably some matches ahead - Put a condition that amplicon should be >)
2 Initialize
Load the variables common to the different scripts and the necessary libraries
# Libraries for bioinfo ----------------------------------------------------
library("Biostrings")
# Libraries tidyr ---------------------------------------------------------
library("ggplot2")
theme_set(theme_light())
library("dplyr")
library("readxl")
library(openxlsx)
library("tibble")
library("readr")
library("purrr")
library("forcats")
library("lubridate")
library("stringr")
# Libraries dvutils and pr2database -------------------------------------------------------
if(any(grepl("package:dvutils", search()))) detach("package:dvutils", unload=TRUE)
library("dvutils")
# if(any(grepl("package:pr2database", search()))) detach("package:pr2database", unload=TRUE)
# library("pr2database")
# Options for knitting the report -------------
library(knitr)
library(rmdformats)
library(kableExtra)
knitr::opts_chunk$set(fig.width=8,
fig.height=6,
eval=TRUE,
cache=TRUE,
echo=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=90)
options(max.print="500")
options(knitr.kable.NA = '')3 Constants
# gene_selected = '16S rRNA'
# for Geisen paper
max_mismatch = 2
gene_selected = "18S_rRNA" # Can be 18S_rRNA or 16_rRNA
rda_file_label = "_all" # This label is added at the end of the rda file name
gene_regions = c("V4", "V9")
primer_sets_excluded = c(73, 74) # These primers should not be included (semi nested PCR)
sequence_length_min = 1500
sequence_length_min_V9 = 1650
sequence_length_min_V4 = 12004 Read primer file
4.1 Build the primer set Table
Only keep the 18S primers with V region
# Read from local database
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)
primers <- tbl(pr2_db_con, "pr2_primers") %>% collect()
primer_sets_all <- tbl(pr2_db_con, "pr2_primer_sets") %>% collect()
disconnect <- db_disconnect(pr2_db_con)
if (gene_selected == "18S_rRNA") {
primer_sets <- primer_sets_all %>% filter(gene == "18S rRNA", !is.na(reference_doi),
!str_detect(gene_region, "ITS|cloning|full"), str_detect(gene_region, "^V")) # Start with V
gene_regions_all = unique(primer_sets$gene_region)
print
} else if (gene_selected == "16S_rRNA") {
primer_sets <- primer_sets %>% filter(specificity == "plastid" | (gene == "18S rRNA" &
specificity == "universal"))
}function (x, ...)
UseMethod("print")
<bytecode: 0x000000001587c5b0>
<environment: namespace:base>
primer_sets <- primer_sets %>% left_join(select(primers, primer_id, fwd_name = name,
fwd_seq = sequence, fwd_start_yeast = start_yeast, fwd_end_yeast = end_yeast),
by = c(fwd_id = "primer_id")) %>% left_join(select(primers, primer_id, rev_name = name,
rev_seq = sequence, rev_start_yeast = start_yeast, rev_end_yeast = end_yeast),
by = c(rev_id = "primer_id")) %>% mutate(length_yeast = rev_end_yeast - fwd_start_yeast +
1) %>% select(gene_region, specificity, primer_set_id, contains("fwd"), contains("rev"),
length_yeast, reference:remark) %>% select(-fwd_id, -rev_id) %>% arrange(gene_region,
fwd_start_yeast, rev_start_yeast) %>% filter(!(primer_set_id %in% primer_sets_excluded)) %>%
mutate(specific = ifelse(is.na(specificity), "general", "specific")) %>% relocate(specific,
.before = specificity)
# , lubridate::today()
file_name = str_c("../output/Table_primers_", gene_selected, ".xlsx")
write.xlsx(primer_sets, file = file_name, firstActiveRow = 2)
n_primers <- list()
n_primers[["general"]] <- nrow(filter(primer_sets, specific == "general"))
n_primers[["specific"]] <- nrow(filter(primer_sets, specific == "specific"))
n_primers$general
[1] 33
$specific
[1] 22
knitr::kable(select(primer_sets, primer_set_id, fwd_name, rev_name, gene_region,
length_yeast)) %>% kableExtra::kable_styling()| primer_set_id | fwd_name | rev_name | gene_region | length_yeast |
|---|---|---|---|---|
| 81 | 18SV1V2F | 18SV1V2R | V1-V2 | 341 |
| 80 | F04 | R22 | V1-V2 | 402 |
| 72 | Kineto_80 | Kineto_651 | V2-V3 | 667 |
| 59 | 152+ | 528- | V2-V3 | 352 |
| 69 | Chryso_240 | Chryso_651 | V2-V3 | 545 |
| 84 | SAR_V3_F | SAR_V3_R | V3 | 182 |
| 62 | Cer2F | Cer1R | V3-V4 | 588 |
| 88 | Par_18S-F | Par_18S-R | V3-V4 | 562 |
| 40 | Uni18SF | Uni18SR | V4 | 461 |
| 13 | 3NDf | V4_euk_R1 | V4 | 465 |
| 14 | 3NDf | V4_euk_R2 | V4 | 458 |
| 12 | 3NDf | 1132rmod | V4 | 599 |
| 34 | 515FY | NSR951 | V4 | 391 |
| 35 | EUK581-F | EUK1134-R | V4 | 578 |
| 33 | 515F Univ | 926R | V4 | 589 |
| 18 | 515F | 1119r | V4 | 598 |
| 4 | 563f | 1132r | V4 | 587 |
| 16 | TAReuk454FWD1 | V4 18S Next.Rev | V4 | 417 |
| 7 | V4_1f | TAReukREV3 | V4 | 417 |
| 8 | TAReuk454FWD1 | TAReukREV3 | V4 | 417 |
| 36 | TAReuk454FWD1 | V4RB | V4 | 417 |
| 86 | EuF-V4 | picoR2 | V4 | 425 |
| 90 | TAReuk454FWD1 | V4r | V4 | 417 |
| 19 | Claudia Vannini (F) | Claudia Vannini (R) | V4 | 439 |
| 1 | F-566 | R-1200 | V4 | 635 |
| 15 | EUKAF | EUKAR | V4 | 410 |
| 17 | E572F | E1009R | V4 | 438 |
| 25 | NSF563 | NSR951 | V4 | 380 |
| 2 | A-528F | R-952 | V4 | 379 |
| 39 | A-528F | PRYM01+7 | V4 | 396 |
| 83 | A-528F | 1132r | V4 | 577 |
| 3 | 574*f | 1132r | V4 | 576 |
| 77 | 574f | 1132r | V4 | 576 |
| 23 | 590F | 1300R | V4 | 720 |
| 21 | D512for | D978rev | V4 | 437 |
| 41 | Cerc479F | Cerc750R | V4 | 283 |
| 24 | EK-565F-NGS | EUK1134-R | V4 | 527 |
| 65 | S616F_Cerco | S947R_Cerco | V4 | 348 |
| 66 | S616F_Eocer | S947R_Cerco | V4 | 348 |
| 63 | S616F_Cerco | S963R_Cerco | V4 | 367 |
| 64 | S616F_Eocer | S963R_Cerco | V4 | 367 |
| 5 | 616f | 1132r | V4 | 534 |
| 6 | 616*f | 1132r | V4 | 534 |
| 38 | ChloroF | ChloroR | V5 | 494 |
| 37 | DimA | DimB | V5 | 295 |
| 87 | Oxy_18S-F | Oxy_18S-R | V5 | 444 |
| 32 | 926wF | 1392-R | V6-V8 | 513 |
| 76 | F-1183 | R-1443 | V7 | 261 |
| 92 | FF390 | FR1 | V7-V8 | 349 |
| 67 | 1301f | 1801r | V7-V9 | 539 |
| 89 | V8f | 1510R | V8-V9 | 372 |
| 28 | 1380F | 1510R | V9 | 176 |
| 29 | 1389F | 1510R | V9 | 167 |
| 31 | 1388F | 1510R | V9 | 167 |
| 27 | 1391F | EukB | V9 | 169 |
5 Computing the matches
This part is done by an R script script_primers_pr2_match.R that is executed in batch mode on Roscoff server.
5.1 Programing Notes
- Use Biostrings
Accessor methods : In the code snippets below, x is an MIndex object.
- length(x): The number of patterns that matches are stored for.
- names(x): The names of the patterns that matches are stored for.
- startIndex(x): A list containing the starting positions of the matches for each pattern.
- endIndex(x): A list containing the ending positions of the matches for each pattern.
- elementNROWS(x): An integer vector containing the number of matches for each pattern.
- x[[i]]: Extract the matches for the i-th pattern as an IRanges object.
- unlist(x, recursive=TRUE, use.names=TRUE): Return all the matches in a single IRanges object. recursive and use.names are ignored.
- extractAllMatches(subject, mindex): Return all the matches in a single XStringViews object.
One could also use another function which does not give the position * match_fwd <- vcountPattern(fwd, seq,max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=FALSE, algorithm=“auto”)
6 Load the primer match files
This avoids recomputing each time.
All sequences for which introns have been removed are filtered out (contain "_UC" in pr2_accession)
6.1 Build the file for all primer sets - DO ONLY ONCE
pr2_match_list <- list()
for (i in 1:nrow(primer_sets)) {
cat("i = ", i, "\n")
rda_file_label <- str_c(sprintf("_set_%03d", primer_sets$primer_set_id[i]), "_mismatches_",
max_mismatch)
load(file = str_c("../output/individual primers/pr2_match_", gene_selected, rda_file_label,
".rda"))
pr2_match_list[[i]] <- pr2_match
}
pr2_match_final <- pr2_match_list %>% reduce(bind_rows)
saveRDS(pr2_match_final, file = str_c("../output/pr2_match_", gene_selected, "_mismatches_",
max_mismatch, ".rds"))6.2 Read the file for all primer sets and filter
# For testing load(file=str_c('../output/pr2_match_', gene_selected ,'_test_mismatches_', max_mismatch, '.rda'))
pr2_match_final <- readRDS(file = str_c("../output/pr2_match_", gene_selected, "_mismatches_", max_mismatch, ".rds"))
print(str_c("Before filtration: ", nrow(pr2_match_final)))[1] "Before filtration: 8447679"
# Replace the gene_region and primer_label since they can be changed in the database
primer_sets_label <- primer_sets %>% mutate(primer_label = str_c(str_sub(gene_region, 1, 2), sprintf("%02d", primer_set_id), str_sub(str_replace_na(specificity,
replacement = ""), 1, 3), sep = "_")) %>% # Remove the last underscore if left by itself
mutate(primer_label = str_replace(primer_label, "_$", "")) %>% select(primer_set_id, primer_label, gene_region, specific, specificity)
pr2_match_final <- pr2_match_final %>% # Remove sequences for which the introns have been removed
filter(!str_detect(pr2_accession, "_UC")) %>% # Remove sequence that are shorter
filter((str_detect(gene_region, "V4") & sequence_length >= sequence_length_min_V4) | (str_detect(gene_region, "V9") & sequence_length >=
sequence_length_min_V9) | (!str_detect(gene_region, "V4|V9") & sequence_length >= sequence_length_min)) %>% # remove '_' from primer_labels
mutate(primer_label = str_replace_all(primer_label, "_", " "), mismatch_number = fwd_mismatch_number + rev_mismatch_number) %>%
# Only keep the selected primers
filter(primer_set_id %in% primer_sets$primer_set_id) %>% # Remove primer_label which is created in the line
select(-primer_label, -gene_region) %>% left_join(primer_sets_label)
print(str_c("After filtration: ", nrow(pr2_match_final)))[1] "After filtration: 4975722"
7 Summarize the data in tables
7.1 Function
primer_summary <- function(pr2_match, taxo_level) {
summary <- pr2_match %>% group_by({
{
taxo_level
}
}, gene_region, primer_label, primer_set_id, specific, specificity) %>% summarize(n_seq = n(),
fwd_number = sum(!is.na(fwd_pos)), fwd_pct = fwd_number/n_seq * 100, fwd_mismatch_0_pct = sum(fwd_mismatch_number ==
0, na.rm = TRUE)/fwd_number * 100, fwd_mismatch_1_pct = sum(fwd_mismatch_number ==
1, na.rm = TRUE)/fwd_number * 100, fwd_mismatch_2_pct = sum(fwd_mismatch_number ==
2, na.rm = TRUE)/fwd_number * 100, fwd_mismatch_pos = mean(fwd_mismatch_primer_position_3prime,
na.rm = TRUE), rev_number = sum(!is.na(rev_pos)), rev_pct = rev_number/n_seq *
100, rev_mismatch_0_pct = sum(rev_mismatch_number == 0, na.rm = TRUE)/rev_number *
100, rev_mismatch_1_pct = sum(rev_mismatch_number == 1, na.rm = TRUE)/rev_number *
100, rev_mismatch_2_pct = sum(rev_mismatch_number == 2, na.rm = TRUE)/rev_number *
100, rev_mismatch_pos = mean(rev_mismatch_primer_position_3prime, na.rm = TRUE),
ampli_number = sum(!is.na(ampli_size)), ampli_pct = ampli_number/n_seq *
100, ampli_size_mean = mean(ampli_size, na.rm = TRUE), ampli_size_sd = sd(ampli_size,
na.rm = TRUE), ampli_size_max = max(ampli_size, na.rm = TRUE), ampli_size_min = min(ampli_size,
na.rm = TRUE), ampli_mismatch_0_pct = sum(mismatch_number == 0, na.rm = TRUE)/n_seq *
100, ampli_mismatch_1_pct = sum(mismatch_number == 1, na.rm = TRUE)/n_seq *
100, ampli_mismatch_2_pct = sum(mismatch_number == 2, na.rm = TRUE)/n_seq *
100, ampli_mismatch_3_pct = sum(mismatch_number == 3, na.rm = TRUE)/n_seq *
100, ampli_mismatch_4_pct = sum(mismatch_number == 4, na.rm = TRUE)/n_seq *
100, ampli_mismatch_5_pct = sum(is.na(mismatch_number), na.rm = TRUE)/n_seq *
100) %>% mutate(across(contains(c("mismatch", "mean")), ~ifelse(is.nan(.x),
NA, .x))) %>% mutate(across(contains(c("size")), ~ifelse(is.infinite(.x),
NA, .x))) %>% ungroup()
# Compute the position of the primer for nearly complete sequences
summary_pos <- pr2_match %>% filter(sequence_length >= sequence_length_min_V9) %>%
group_by({
{
taxo_level
}
}, gene_region, primer_label, primer_set_id) %>% summarize(fwd_pos_mean = mean(fwd_pos,
na.rm = TRUE), rev_pos_mean = mean(rev_pos, na.rm = TRUE)) %>% ungroup()
summary <- summary %>% left_join(summary_pos)
}7.2 Summarize all eukaryotes
7.3 Summarize per supergroup
7.4 Summarize per class
pr2_match_summary_primer_set_class <- primer_summary(pr2_match_final, class)
pr2_taxo <- pr2_match_final %>% select(supergroup, division, class) %>% distinct()
pr2_match_summary_primer_set_class <- pr2_match_summary_primer_set_class %>% left_join(pr2_taxo) %>%
relocate(supergroup, division, .before = class)7.5 Save summaries (also in shiny)
saveRDS(pr2_match_summary_primer_set, file = str_c("../output/pr2_match_", gene_selected,
"_mismatches_", max_mismatch, "_summary.rds"))
saveRDS(pr2_match_summary_primer_set_sg, file = str_c("../output/pr2_match_", gene_selected,
"_mismatches_", max_mismatch, "_summary_sg.rds"))
saveRDS(pr2_match_summary_primer_set_class, file = str_c("../output/pr2_match_",
gene_selected, "_mismatches_", max_mismatch, "_summary_class.rds"))
# Shiny
saveRDS(pr2_match_summary_primer_set_class, file = str_c("../shiny/pr2_match_", gene_selected,
"_mismatches_", max_mismatch, "_summary_class.rds"))7.6 Read summaries
pr2_match_summary_primer_set <- readRDS(file = str_c("../output/pr2_match_", gene_selected,
"_mismatches_", max_mismatch, "_summary.rds"))
pr2_match_summary_primer_set_sg <- readRDS(file = str_c("../output/pr2_match_", gene_selected,
"_mismatches_", max_mismatch, "_summary_sg.rds"))
pr2_match_summary_primer_set_class <- readRDS(file = str_c("../output/pr2_match_",
gene_selected, "_mismatches_", max_mismatch, "_summary_class.rds"))
# Long form for Number of sequences
# This dataframe is used to re-order the bars correctly
pct_category_order <- data.frame(pct_category = c("ampli_pct", "fwd_pct", "rev_pct"),
pct_category_order = c(1, 3, 2))
pr2_match_summary_primer_set_long <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "pct_category",
values_to = "pct_seq", cols = fwd_pct:ampli_pct) %>% left_join(pct_category_order)8 Graphics
8.2 Amplicon length
8.2.1 Scatter
ggplot(pr2_match_final, aes(x = primer_label, y = ampli_size, group = primer_set_id,
color = as.factor(mismatch_number))) + geom_point(size = 3) + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(x = "Primer set") + scale_color_viridis_d() + facet_wrap(vars(specific),
nrow = 2, scales = "free_x")8.2.2 Average size
ggplot(pr2_match_final, aes(x = primer_label, y = ampli_size, group = primer_set_id,
fill = as.factor(mismatch_number))) + geom_boxplot(outlier.alpha = 0.3) + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(x = "Primer set") + scale_fill_viridis_d() + facet_wrap(vars(specific),
nrow = 2, scales = "free_x")8.3 Example with sets V4 and V9
8.3.1 Plot an example of amplicon distribution (Fig. 3)
for (one_primer_set in c(4, 29)) {
if (one_primer_set == 4) {
xmin = 570
xmax = 610
xmax2 = 2000
} else {
xmin = 130
xmax = 190
xmax2 = 1000
}
pr2_match_final_one <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>%
filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X", "Protoalveoalata")))
primer_label <- str_replace_all(pr2_match_final_one$primer_label[1], "_", " ")
g <- ggplot(pr2_match_final_one, aes(x = ampli_size)) + geom_density(fill = "blue",
alpha = 0.9) + xlab("Amplicon size") + ggtitle(str_c("Primer set - ", primer_label)) +
xlim(xmin, xmax)
print(g)
g <- ggplot(pr2_match_final_one, aes(x = ampli_size, fill = supergroup)) + geom_density(alpha = 0.9) +
theme_bw() + # theme(legend.position = 'none') +
guides(fill = guide_legend(nrow = 3)) + theme(legend.position = "top", legend.box = "horizontal") +
scale_fill_viridis_d(option = "inferno") + labs(x = "Amplicon size (bp)",
y = "Density", fill = "Supergroup") + # ggtitle(str_c('Primer set - ', primer_label)) +
xlim(xmin, xmax)
print(g)
fig3[[str_c(one_primer_set, " size_distri")]] <- g
g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_boxplot(outlier.alpha = 0.3) +
theme_bw() + coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Supergroup") + ylab("Amplicon size (bp)") + ylim(0, xmax2)
print(g)
fig3[[str_c(one_primer_set, " size")]] <- g
g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_violin() +
coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Supergroup") + ylab("Amplicon size (bp)")
print(g)
}8.3.2 Plot an example of percent amplification
for (one_primer_set in c(4, 29)) {
pr2_match_summary_filtered <- pr2_match_summary_primer_set_sg %>% filter((n_seq >
20) & (primer_set_id == one_primer_set) & !(supergroup %in% c("Apusozoa",
"Eukaryota_X")))
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered,
aes(x = str_c(supergroup, " - n = ", n_seq), y = ampli_pct, fill = supergroup),
position = "dodge") + theme_bw() + # theme(legend.position = 'none') +
guides(fill = guide_legend(nrow = 2)) + theme(legend.position = "top", legend.box = "horizontal") +
scale_fill_viridis_d(option = "inferno") + coord_flip() + labs(x = "% of sequences amplified",
y = "", legend = "Supergroup") + scale_y_continuous(minor_breaks = seq(0,
100, by = 10), breaks = seq(0, 100, by = 20), limits = c(0, 100))
# ggtitle (str_c('Set - ', one_primer_set, ' - % amplified per Supergroup') )
print(g)
fig3[[str_c(one_primer_set, " pct")]] <- g
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(supergroup,
" - n = ", n_seq), y = ampli_size_mean), colour = "black") + coord_flip() +
geom_errorbar(aes(x = str_c(supergroup, " - n = ", n_seq), ymax = ampli_size_mean +
ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") +
ylab("Amplicon size (bp)") # +
# scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle
# (str_c('Set - ', one_primer_set, ' - Amplicon sizes') ) + geom_hline(yintercept
# = c(450,550) , linetype= 2)
print(g)
}8.3.3 Plot an example of mismatches
for (one_primer_set in c(4, 29)) {
pr2_mismatches <- pr2_match_summary_primer_set_sg %>% tidyr::pivot_longer(names_to = "mismatch_number",
values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>%
filter((n_seq > 20) & (primer_set_id == one_primer_set) & !(supergroup %in%
c("Apusozoa", "Eukaryota_X")))
g <- ggplot(pr2_mismatches, aes(x = str_c(supergroup, " - n = ", n_seq), y = mismatch_pct,
group = primer_set_id, fill = as.factor(mismatch_number))) + geom_col() +
theme_bw() + theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) +
labs(x = "Supergroup", y = "% of sequences with mismatches", fill = "Mismatches") +
scale_fill_viridis_d(direction = -1, alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) +
theme(legend.position = "top", legend.box = "horizontal") + coord_flip()
print(g)
fig3[[str_c(one_primer_set, "mismatches", sep = " ")]] <- g
print(g)
}8.3.4 Plot an example of mismatches positions
for (one_primer_set in c(4, 29)) {
df <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% filter(!(supergroup %in%
c("Apusozoa", "Eukaryota_X")))
g <- ggplot() + geom_histogram(data = filter(df, !is.na(fwd_mismatch_primer_position_3prime)),
aes(x = fwd_mismatch_primer_position_3prime, fill = supergroup), binwidth = 1,
stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() +
scale_x_continuous(minor_breaks = 0:30) + labs(y = "Density", x = "Position of mismatches from 3' end",
title = "fwd")
print(g)
fig3[[str_c(one_primer_set, "mismatches positions fwd", sep = " ")]] <- g
g <- ggplot() + geom_histogram(data = filter(df, !is.na(rev_mismatch_primer_position_3prime)),
aes(x = rev_mismatch_primer_position_3prime, fill = supergroup), binwidth = 1,
stat = "bin", alpha = 1) + scale_fill_viridis_d(option = "inferno") + theme_bw() +
scale_x_continuous(minor_breaks = 0:30) + labs(y = "Density", x = "Position of mismatches from 3' end",
title = "rev")
print(g)
fig3[[str_c(one_primer_set, "mismatches positions rev", sep = " ")]] <- g
}8.4 All Eukaryotes
Comments
- Percent of sequences amplified
- Logically, the % of seq amplified is always < min(% of sequences matching forwar, % of sequences matching reverse)
- In general it is the reverse primer that causes problems.
- Some primer sets do not amplify any sequence (11, 19, 20, 21)
- For V9, primer set 30 reverse primer is in the ITS region which is not present in the PR2 sequences, so no amplification.
- Amplicon sizes
- Only 8 V4 primer sets are suitable for Illumina 2x250 (max amplicon size = 450 bp)
- This is if we consider the mean… If we consider the variation around the mean then, only 3 suitable for 2x250
- Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)
8.4.1 Plot only V4 and V9
fig1 <- list()
for (one_region in gene_regions) {
pr2_match_summary_primer_set_region_long <- pr2_match_summary_primer_set_long %>%
filter(gene_region == one_region) %>% # Remove the group specific primers
filter(specific == "general") %>% filter(pct_category %in% c("ampli_pct", "fwd_pct",
"rev_pct"))
pr2_match_summary_primer_set_region <- pr2_match_summary_primer_set %>% filter(gene_region ==
one_region) %>% # Remove the group specific primers)
filter(specific == "general")
pr2_match_region <- pr2_match_final %>% filter(gene_region == one_region) %>%
# Remove the group specific primers
filter(specific == "general")
# % Ampli
g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = primer_label,
y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), width = 0.7,
position = "dodge") + theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") +
scale_fill_manual(name = "", values = c(ampli_pct = "black", fwd_pct = "grey80",
rev_pct = "grey40"), labels = c("Amplicons", "Primer rev", "Primer fwd")) +
ggtitle(str_c(one_region, " - Percentage of sequences recovered")) + theme(axis.text.x = element_text(angle = 0,
hjust = 1)) + ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) +
theme(legend.position = "top", legend.box = "horizontal")
print(g)
# Size
fig1[[str_c(one_region, "pct", sep = " ")]] <- g
g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) +
geom_point(aes(x = primer_label, y = ampli_size_mean), colour = "black") +
geom_errorbar(aes(x = primer_label, ymax = ampli_size_mean + ampli_size_sd,
ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") +
ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0, 850) +
theme(axis.text.x = element_text(angle = 0, hjust = 1)) + coord_flip()
print(g)
fig1[[str_c(one_region, "size", sep = " ")]] <- g
# Mismatches
pr2_mismatches <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "mismatch_number",
values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>%
filter(gene_region == one_region) %>% # Remove the group specific primers
filter(specific == "general")
g <- ggplot(pr2_mismatches, aes(x = primer_label, y = mismatch_pct, group = primer_set_id,
fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0,
hjust = 0, vjust = 0)) + labs(x = "Primer set", y = "% of sequences with mismatches",
fill = "Mismatches") + scale_fill_viridis_d(direction = -1, alpha = 0.85) +
guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") +
coord_flip()
print(g)
fig1[[str_c(one_region, "mismatches", sep = " ")]] <- g
}8.4.2 Plot all with panel general and specific
for (specific_one in c("general", "specific")) {
df <- pr2_match_summary_primer_set_long %>% filter(pct_category %in% c("ampli_pct",
"fwd_pct", "rev_pct")) %>% filter(specific == specific_one)
g <- ggplot(df) + geom_col(aes(x = primer_label, y = pct_seq, fill = fct_reorder(pct_category,
pct_category_order)), width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") +
ylab("% of sequences amplified") + scale_fill_manual(name = "", values = c(ampli_pct = "black",
fwd_pct = "grey80", rev_pct = "grey40"), labels = c("Amplicons", "Primer rev",
"Primer fwd")) + ggtitle(str_c(specific_one, " - Percentage of sequences recovered")) +
theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + ylim(0,
100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
legend.box = "horizontal")
print(g)
fig1[[str_c(specific_one, "pct", sep = " ")]] <- g
df <- pr2_match_summary_primer_set %>% filter(!is.nan(ampli_size_mean)) %>% filter(specific ==
specific_one)
g <- ggplot(df) + geom_point(aes(x = primer_label, y = ampli_size_mean), colour = "black") +
geom_errorbar(aes(x = primer_label, ymax = ampli_size_mean + ampli_size_sd,
ymin = ampli_size_mean - ampli_size_sd)) + theme_bw() + xlab("Primer set") +
ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c(specific_one, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0, 850) +
theme(axis.text.y = element_text(angle = 0, hjust = 0)) + coord_flip()
print(g)
fig1[[str_c(specific_one, "size", sep = " ")]] <- g
}8.5 Mismatches
8.5.1 Number of mismatches
for (specific_one in c("general", "specific")) {
pr2_mismatches <- pr2_match_summary_primer_set %>% tidyr::pivot_longer(names_to = "mismatch_number",
values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>%
filter(specific == specific_one)
g <- ggplot(pr2_mismatches, aes(x = primer_label, y = mismatch_pct, group = primer_set_id,
fill = as.factor(mismatch_number))) + geom_col() + theme_bw() + theme(axis.text.y = element_text(angle = 0,
hjust = 0, vjust = 0)) + labs(x = "Primer set", y = "% of sequences with mismatches",
fill = "Mismatches") + scale_fill_viridis_d(direction = -1, alpha = 0.85) +
guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") +
coord_flip()
print(g)
fig1[[str_c(specific_one, "mismatches", sep = " ")]] <- g
}8.5.2 Position of mismatches
ggplot(pr2_match_final, aes(x = primer_label)) + geom_boxplot(aes(y = fwd_mismatch_primer_position_3prime),
outlier.alpha = 0.3, color = "blue") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0,
vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25, breaks = seq(0,
25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches",
title = "Primer fwd") + facet_wrap(vars(specific), nrow = 2, scales = "free_y")ggplot(pr2_match_final, aes(x = primer_label)) + geom_boxplot(aes(y = rev_mismatch_primer_position_3prime),
outlier.alpha = 0.3, color = "red") + coord_flip() + theme_bw() + theme(axis.text.y = element_text(angle = 0,
vjust = 0, hjust = 0)) + scale_y_continuous(minor_breaks = 0:25, breaks = seq(0,
25, by = 5), limits = c(0, 25)) + labs(x = "Primer set", y = "Position of mismatches",
title = "Primer rev") + facet_wrap(vars(specific), nrow = 2, scales = "free_y")ggplot(filter(pr2_match_final, !is.na(rev_mismatch_primer_position_3prime))) + geom_histogram(aes(x = rev_mismatch_primer_position_3prime),
color = "red", binwidth = 1, stat = "density") + theme_bw() + scale_x_continuous(minor_breaks = 0:30,
breaks = seq(0, 30, by = 5), limits = c(0, 30)) + labs(y = "Primer set", x = "Position of mismatches",
title = "Primer rev") + facet_wrap(vars(primer_label), ncol = ncol) ## By supergroup
Comments
- Excavata have a very different patterns from the rest of the group. They are not amplified by quite a few primer sets. They have also bigger amplicons
- Some groups exhibit higher variability in amplicon size (e.g Chlorophyta)
8.5.3 % of Ampli and Amplicon size
fig_supergroup <- list()
for (specific_one in c("general", "specific")) {
pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>%
filter(specific == specific_one) %>% filter((n_seq > 20)) %>% filter(!(supergroup %in%
c("Apusozoa", "Eukaryota_X")))
g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = supergroup,
y = ampli_pct, fill = supergroup), position = "dodge") + theme_bw() + coord_flip() +
ylab("% of sequences amplified") + xlab("Supergroup") + ggtitle(str_c(specific_one,
" primers")) + scale_fill_viridis_d(option = "inferno") + ylim(0, 100) +
facet_wrap(~primer_label, scales = "fixed", ncol = ncol) + guides(fill = guide_legend(nrow = 1)) +
theme(legend.position = "top", legend.box = "horizontal") + theme(legend.position = "none")
print(g)
list_label <- str_c("pct", specific_one, sep = " ")
fig_supergroup[[list_label]] <- g
g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) +
geom_point(aes(x = supergroup, y = ampli_size_mean), colour = "black") +
theme_bw() + coord_flip() + geom_errorbar(aes(x = supergroup, ymax = ampli_size_mean +
ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") +
ylab("Amplicon size (bp)") + ggtitle(str_c(specific_one, " primers")) + geom_hline(yintercept = c(450,
550), linetype = 2) + facet_wrap(~primer_label, scales = "fixed", ncol = ncol)
print(g)
list_label <- str_c("size", specific_one, sep = " ")
fig_supergroup[[list_label]] <- g
}8.5.4 Number of mismatches
pr2_mismatches_sg <- pr2_match_summary_primer_set_sg %>% tidyr::pivot_longer(names_to = "mismatch_number",
values_to = "mismatch_pct", cols = contains("ampli_mismatch"), names_prefix = "ampli_mismatch_") %>%
select(-contains("ampli_mismatch")) %>% mutate(mismatch_number = str_sub(mismatch_number,
1, 1)) %>% mutate(mismatch_number = str_replace(mismatch_number, "5", "5+")) %>%
filter((n_seq > 20)) %>% filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X")))
for (specific_one in c("general", "specific")) {
pr2_mismatches_sg_one <- pr2_mismatches_sg %>% filter(specific == specific_one)
g <- ggplot(pr2_mismatches_sg_one, aes(x = supergroup, y = mismatch_pct, fill = fct_rev(mismatch_number))) +
geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
labs(x = "Group", y = "% of sequences with mismatches", title = str_c(specific_one,
" primers"), fill = "Mismatches") + scale_fill_viridis_d(direction = 1,
alpha = 0.85) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top",
legend.box = "horizontal") + coord_flip() + facet_wrap(vars(primer_label),
ncol = ncol)
print(g)
fig_supergroup[[str_c("mismatches", specific_one, sep = " ")]] <- g
}8.6 By class for autotrophs
fig_class <- list()
for (specific_one in c("general", "specific")) {
pr2_match_summary_filtered_one <- pr2_match_summary_primer_set_class %>% filter(n_seq >
20) %>% filter(division %in% c("Haptophyta", "Dinoflagellata", "Chlorophyta",
"Ochrophyta", "Cryptophyta")) %>% filter(specific == specific_one)
g <- ggplot(pr2_match_summary_filtered_one) + geom_col(aes(x = str_c(str_trunc(division,
20, ellipsis = ""), "-", class), y = ampli_pct, fill = class), position = "dodge") +
theme_bw() + coord_flip() + ylab("% of sequences amplified") + xlab("Class") +
theme(axis.text.y = element_text(angle = 0, hjust = 0, vjust = 0)) + scale_fill_viridis_d(option = "inferno") +
ylim(0, 100) + facet_wrap(vars(primer_label), scales = "fixed", ncol = ncol) +
guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", legend.box = "horizontal") +
theme(legend.position = "none")
print(g)
list_label <- str_c("pct", specific_one, sep = " ")
fig_class[[list_label]] <- g
g <- ggplot(filter(pr2_match_summary_filtered_one, !is.nan(ampli_size_mean))) +
geom_point(aes(x = str_c(str_trunc(division, 3, ellipsis = ""), "-", class),
y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(str_trunc(division,
3, ellipsis = ""), "-", class), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
ampli_size_sd)) + theme_bw() + coord_flip() + theme(axis.text.y = element_text(angle = 0,
hjust = 0, vjust = 0)) + xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) + ggtitle
# (str_c('Set -', one_primer_set, ' - Amplicon size - Lines correspond to limits
# for Illumina 2x250 and 2x300 respectively') ) +
geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_label,
scales = "fixed", ncol = ncol)
print(g)
list_label <- str_c("size", specific_one, sep = " ")
fig_class[[list_label]] <- g
}9 Specific analyses
9.1 Specific et sets for Opisthokonta
- 35 UnNonMet
- 16 Piredda
- 17 Comeau
for (one_primer_set in c(16, 17, 35)) {
pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq >
20) & (supergroup %in% c("Opisthokonta")) & (primer_set_id == one_primer_set))
g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered,
aes(x = str_c(division, "-", class, " - n= ", n_seq), y = ampli_pct), fill = "grey",
position = "dodge") + theme(axis.text.y = element_text(angle = 0, hjust = 0,
vjust = 0)) + theme_bw() + coord_flip() + ylab("% of sequences amplified") +
xlab("Class") + ggtitle(str_c("Set -", one_primer_set, " - % amplified per Class"))
print(g)
g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(division,
"-", class, " - n= ", n_seq), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = str_c(division,
"-", class, " - n= ", n_seq), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean -
ampli_size_sd)) + theme(axis.text.y = element_text(angle = 0, hjust = 0,
vjust = 0)) + coord_flip() + xlab("Class") + ylab("Amplicon size (bp)") +
# scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) +
geom_hline(yintercept = c(450, 550), linetype = 2)
print(g)
}10 Tables
10.1 Table 1 and Supplementary tables
if (gene_selected == "18S_rRNA") file_name_xls = "../output/primers 18S.xlsx"
sheets = list(`Table 1` = primer_sets, `Summary Eukaryotes` = pr2_match_summary_primer_set,
`Summary Supergroup` = pr2_match_summary_primer_set_sg, `Summary Class` = pr2_match_summary_primer_set_class)
excel_style <- openxlsx::createStyle(numFmt = "0.00")
openxlsx::write.xlsx(sheets, file = file_name_xls, zoom = 80, firstActiveRow = 2,
firstActiveCol = 6, colWidths = "auto")11 Figures
11.1 Fig. 1 - V4 and V9 - Amplification % and size
legend_pct <- cowplot::get_legend( fig1[["V4 pct"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(0, 0, 0, 20))
)
legend_mismatches <- cowplot::get_legend( fig1[["V4 mismatches"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(0, 0, 0, 20))
)
fig_1 <- cowplot::plot_grid(legend_pct, NULL,legend_mismatches,
fig1[["V4 pct"]] +ggtitle("")+ theme(legend.position="none") + ylab(""),
fig1[["V4 mismatches"]] +ggtitle("")+ theme(legend.position="none") + xlab("") + ylab(""),
fig1[["V4 size"]] +ggtitle("") + xlab("") + ylab(""),
fig1[["V9 pct"]] +ggtitle("")+ theme(legend.position="none"),
fig1[["V9 mismatches"]] +ggtitle("")+ theme(legend.position="none") + xlab(""),
fig1[["V9 size"]] +ggtitle("") + xlab(""),
labels = c("","","","A","","", "B","",""), label_size = 22, label_x = 0.05,
ncol = 3, nrow = 3, align = "hv",
rel_heights = c(1,20,6), rel_widths = c(12,12,10) )
fig_1
ggsave(plot= fig_1 , filename="../figs/fig_pct_sizes_mismatches_V4_V9.pdf",
width = 12 , height = 12, scale=1.75, units="cm", useDingbats=FALSE)11.2 Fig. 1 - All - Amplification % and size and mismatches
legend_pct <- cowplot::get_legend( fig1[["general pct"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(0, 0, 0, 20))
)
legend_mismatches <- cowplot::get_legend( fig1[["general mismatches"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(0, 0, 0, 20))
)
fig_1 <- cowplot::plot_grid(legend_pct, legend_mismatches, NULL,
fig1[["general pct"]] +ggtitle("")+ theme(legend.position="none") + ylab(""),
fig1[["general mismatches"]] +ggtitle("")+ theme(legend.position="none") + xlab("") + ylab(""),
fig1[["general size"]] +ggtitle("") + xlab("") + ylab(""),
fig1[["specific pct"]] +ggtitle("")+ theme(legend.position="none"),
fig1[["specific mismatches"]] +ggtitle("")+ theme(legend.position="none") + xlab(""),
fig1[["specific size"]] +ggtitle("") + xlab(""),
labels = c("","","", "A","","", "B","",""),
label_size = 12, label_x = 0.05,
ncol = 3, nrow = 3, align = "hv",
rel_heights = c(1,38,26), rel_widths = c(12,12,10) )
fig_111.3 Fig. - Example of V4 and V9
legend_mismatches <- cowplot::get_legend( fig3[["4 mismatches"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(t = 20, r = 0, b = 20, l = 20))
)
legend_size_distri <- cowplot::get_legend( fig3[["4 size_distri"]] +
# create some space to the left of the legend
theme(legend.box.margin = margin(t = 20, r = 0, b = 20, l = 20))
)
fig_sub4 <- cowplot::plot_grid(fig3[["4 mismatches positions fwd"]] + theme(legend.position="none") + ggtitle("fwd")+ xlab(""),
fig3[["4 mismatches positions rev"]] + theme(legend.position="none") + ggtitle("rev"),
nrow = 2,
align = "v" )
fig_sub29 <- cowplot::plot_grid(fig3[["29 mismatches positions fwd"]] + theme(legend.position="none") + ggtitle("fwd")+ xlab(""),
fig3[["29 mismatches positions rev"]] + theme(legend.position="none") + ggtitle("rev"),
nrow = 2,
align = "v" )
fig <- cowplot::plot_grid(legend_mismatches,
legend_size_distri,
fig3[["4 mismatches"]] + theme(legend.position="none"),
fig_sub4,
fig3[["4 size_distri"]] + theme(legend.position="none"),
fig3[["4 size"]],
fig3[["29 mismatches"]]+ theme(legend.position="none"),
fig_sub29,
fig3[["29 size_distri"]]+ theme(legend.position="none"),
fig3[["29 size"]],
labels = c("","","A - Primer V4 #4","","","", "B - Primer V9 #29","","",""),
label_y = 1.05, hjust = -0.1, scale=0.95,
ncol = 2, nrow = 5, rel_heights = c(4,20,20,20,20),
align = "h" )
fig11.4 Fig. 3 - Supergroup analysis
row_height = 5
for (specific_one in c("general", "specific")) {
fig_3 <- cowplot::plot_grid(fig_supergroup[[str_c("mismatches", specific_one,
sep = " ")]], NULL, fig_supergroup[[str_c("size", specific_one, sep = " ")]],
NULL, labels = c("A", "", "B", ""), ncol = 2, nrow = 2, align = "v", rel_widths = c(13,
0.2))
fig_3
height <- row_height * (ceiling(n_primers[[specific_one]]/ncol))
ggsave(plot = fig_3, filename = str_c("../figs/fig_supergroup_", specific_one,
".pdf"), width = 14, height = height, scale = 1.75, units = "cm", useDingbats = FALSE)
}11.5 Fig. 4 - Class analysis
row_height = 3.5
for (specific_one in c("general", "specific")) {
fig_4 <- cowplot::plot_grid(fig_class[[str_c("pct", specific_one, sep = " ")]],
ncol = 1, nrow = 1, align = "v")
fig_4
height <- row_height * (trunc(n_primers[[specific_one]]/ncol))
ggsave(plot = fig_4, filename = str_c("../figs/fig_class_", specific_one, ".pdf"),
width = 7, height = height, scale = 3, units = "cm", useDingbats = FALSE)
}